#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2016-06-15 23:09:53
# @Author  : lichao (qingligongzi@163.com)
# @Link    : http://www.qingligongzi.com
# @Version : 0.0.1

import urllib2
import re
import http_util
import sys

# Python 2 workaround for "UnicodeEncodeError: 'ascii' codec can't encode
# characters": make UTF-8 the implicit str<->unicode codec so the decoded
# (unicode) page text can be formatted into byte strings and printed.
# reload(sys) is needed because setdefaultencoding is removed from the sys
# module namespace at interpreter start-up.
reload(sys)
sys.setdefaultencoding("utf-8")

# Page number of the listing to fetch.
page = 4

# Listing URL for the selected page.
url = "http://www.qiushibaike.com/8hr/page/" + str(page)

# Request headers are supplied by the project's http_util module.
headers = http_util.headers

# Fetch the page and decode the body as UTF-8 text.
req = urllib2.Request(url, None, headers)
response = urllib2.urlopen(req)
try:
    content = response.read().decode("utf-8")
finally:
    # Python 2 response objects are not context managers, so close the
    # connection explicitly even if reading or decoding raises.
    response.close()

# print(content)

# Parse the listing HTML with regular expressions.
# Every match of `pattern` yields six groups, used below as:
#   0 - src of the author's avatar <img>
#   1 - text of the <h2> inside the author link (printed as the nickname)
#   2 - body of the content <div>
#   3 - markup between the content </div> and the vote counter; searched
#       for an optional thumbnail image
#   4 - number shown in the stats-vote counter
#   5 - number shown in the qiushi_comments counter
pattern_str = (
    r'<div.*?class=.*?author.*?<a.*?<img.*?src="(.*?)".*?<a.*?href="/users.*?<h2>(.*?)</h2>'
    r'.*?<div.*?content">(.*?)</div>(.*?)<span.*?stats-vote.*?number">(.*?)</i>.*?qiushi_comments.*?number">(.*?)</i>'
)
pattern = re.compile(pattern_str, re.S)

# Compiled once, outside the loop, under its own name so the main pattern
# above is not overwritten while iterating.
thumb_pattern = re.compile(r'class="thumb.*?<img.*?src="(.*?)"', re.S)

items = pattern.findall(content)
for item in items:
    # Only the first thumbnail (if any) is reported; an item without one
    # gets an empty URL.
    thumb_match = thumb_pattern.search(item[3])
    content_img_url = thumb_match.group(1) if thumb_match else ''

    print("头像url:{0}\n昵称:{1}\n内容:\n{2}\n内容url:{3}\n好笑数:{4}\n评论数:{5}".format(
        item[0], item[1], item[2].replace('<br/>', '\n').strip(), content_img_url, item[4], item[5]))
    print("-" * 20)
